1 Introduction

Here, we will transfer the label and UMAP coordinates from the discovery cohort to the validation cohort. We will do it one compartment at a time (here, follicular dendritic cells, pdc), so we can validate that the transfer is robust.

Specifically, we will follow these steps for every compartment:

  1. Include or update the annotation label for this object.
  2. Plot the integrated object split by type (quey/reference) to visualize if discovery and validation cohorts were integrated properly.
  3. Transfer label.
  4. Transfer UMAP coordinates + visualize
  5. Exclude cells with a low annotation probability and/or a high doublet score
  6. Visualize heatmap of specific markers to validate that the cluster and annotation holds for the new cohort.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(tidyverse)
library(caret)
library(class)
library(harmony)
library(here)
library(glue)
library(DT)

2.2 Source functions

source(here("scRNA-seq/bin/SLOcatoR_functions.R"))
source(here("scRNA-seq/bin/utils.R"))
source(here("scRNA-seq/bin/utils_final_clusters.R"))
color_palette <-  c("#1CFFCE", "#90AD1C", "#C075A6", "#85660D", "#5A5156", "#AA0DFE",
                    "#F8A19F", "#F7E1A0", "#1C8356", "#FEAF16", "#822E1C", "#C4451C",
                    "#1CBE4F", "#325A9B", "#F6222E", "#FE00FA", "#FBE426", "#16FF32",
                    "black",   "#3283FE", "#B00068", "#DEA0FD", "#B10DA1", "#E4E1E3",
                    "#90AD1C", "#FE00FA", "#85660D", "#3B00FB", "#822E1C", "coral2",
                    "#1CFFCE", "#1CBE4F", "#3283FE", "#FBE426", "#F7E1A0", "#325A9B",
                    "#2ED9FF", "#B5EFB5", "#5A5156", "#DEA0FD", "#FEAF16", "#683B79",
                    "#B10DA1", "#1C7F93", "#F8A19F", "dark orange", "#FEAF16", "#FBE426",
                    "Brown")

3 PDC

# Read data, clean and include annotation
pdc <- readRDS(here("scRNA-seq/results/R_objects/seurat_objects_revision/4.3-PDC_seurat_object_discovery_validation_cohorts.rds"))
pdc <- updateAnnotation(pdc)
table(pdc$assay)
## 
##       3P multiome 
##     1114      275
pdc <- pdc[, pdc$assay == "3P"]
pdc$annotation_20230508 <- pdc$annotation_20220619
pdc$annotation_20230508[is.na(pdc$annotation_20230508)] <- "unannotated"
pdc$annotation_20230508 <- factor(
  pdc$annotation_20230508,
  levels = c(names(colors_20230508[["PDC"]]), "unannotated")
)
Idents(pdc) <- "annotation_20230508"
pdc$is_reference <- ifelse(
  pdc$cohort_type == "discovery",
  "reference",
  "query"
)
DimPlot(
  pdc,
  group.by = "annotation_20230508",
  split.by = "is_reference",
  cols = colors_20230508[["PDC"]]
)

DimPlot(
  pdc,
  group.by = "assay",
  split.by = "is_reference"
)

We see that we have too few multiome cells for a proper label transfer. Thus, we will label them as “pdc”, and focus on transering the label from the discovery scRNA-seq dataset to validation scRNA-seq. Let us integrate again this dataset:

Idents(pdc) <- "gem_id"
shared_hvg <- find_assay_specific_features(pdc, assay_var = "cohort_type")
print(glue("The number of shared HVG is: {length(shared_hvg)}"))
## The number of shared HVG is: 2189
table(pdc$cohort_type)
## 
##  discovery validation 
##        322        792
pdc <- integrate_assays(
  pdc,
  assay_var = "cohort_type",
  shared_hvg = shared_hvg
)
pdc <- RunUMAP(pdc, dims = 1:25, reduction = "harmony")
DimPlot(
  pdc,
  group.by = "annotation_20230508",
  split.by = "cohort_type",
  cols = colors_20230508[["PDC"]]
)

DimPlot(
  pdc,
  group.by = "cohort_type"
)

FeaturePlot(pdc, c("CD79B", "CD3G")) &
  scale_color_viridis_c(option = "magma")

pdc <- FindNeighbors(pdc, reduction = "harmony", dims = 1:25, k.param = 10)
pdc <- FindClusters(pdc, resolution = 0.8)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1114
## Number of edges: 20405
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.7701
## Number of communities: 11
## Elapsed time: 0 seconds
DimPlot(pdc, cols = color_palette)

We see that there are two clusters of doublets with B and T cells (5, 10). Let’s exclude them and reintegrate:

markers_5 <- FindMarkers(pdc, ident.1 = "5", only.pos = TRUE, logfc.threshold = 0.7)
DT::datatable(markers_5)
markers_10 <- FindMarkers(pdc, ident.1 = "10", only.pos = TRUE, logfc.threshold = 0.7)
DT::datatable(markers_10)
pdc <- pdc[, !(pdc$seurat_clusters %in% c("5", "10") & pdc$cohort_type == "validation")]
DimPlot(pdc)

table(pdc$cohort_type)
## 
##  discovery validation 
##        322        697
shared_hvg <- find_assay_specific_features(pdc, assay_var = "cohort_type")
print(glue("The number of shared HVG is: {length(shared_hvg)}"))
## The number of shared HVG is: 2227
table(pdc$cohort_type)
## 
##  discovery validation 
##        322        697
pdc <- integrate_assays(
  pdc,
  assay_var = "cohort_type",
  shared_hvg = shared_hvg
)
pdc <- RunUMAP(pdc, dims = 1:25, reduction = "harmony")
DimPlot(
  pdc,
  group.by = "annotation_20230508",
  split.by = "cohort_type",
  cols = colors_20230508[["PDC"]]
)

Now we can transfer the label and UMAP coords

# Split training and test sets
data_sets <- split_training_and_test_sets(
  pdc,
  split_var = "cohort_type",
  referece_label = "discovery",
  query_label = "validation",
  reduction = "harmony",
  n_dims = 25
)

# Transfer label
annotation_query_df <- transfer_label(
  seurat_obj = pdc,
  training_set = data_sets$training_set,
  test_set = data_sets$test_set,
  k = 8,
  response_var = "annotation_20230508"
)

# Transfer UMAP coords
# umap_test_df <- transfer_umap_coords(
#   seurat_obj = pdc,
#   training_set = data_sets$training_set,
#   test_set = data_sets$test_set,
#   umap1_var = "UMAP_1_20220215",
#   umap2_var = "UMAP_2_20220215",
#   k = 15
# )


# Include annotation and UMAP coords in Seurat object
pdc$annotation_20230508[annotation_query_df$query_cells] <- annotation_query_df$annotation
Idents(pdc) <- "annotation_20230508"
pdc$annotation_20230508_probability <- NA
pdc$annotation_20230508_probability[annotation_query_df$query_cells] <- annotation_query_df$annotation_prob
# pdc$UMAP_1_20220215[umap_test_df$query_cells] <- umap_test_df$UMAP1
# pdc$UMAP_2_20220215[umap_test_df$query_cells] <- umap_test_df$UMAP2
p1 <- FeaturePlot(
  pdc[, annotation_query_df$query_cells],
  "annotation_20230508_probability"
) + scale_color_viridis_c(option = "magma")
p2 <- FeaturePlot(
  pdc[, annotation_query_df$query_cells],
  "doublet_score_scDblFinder"
) + scale_color_viridis_c(option = "magma")
p1 | p2

DimPlot(
  pdc,
  group.by = "annotation_20230508",
  split.by = "cohort_type",
  cols = colors_20230508[["PDC"]]
)

table(pdc$annotation_20230508_probability)
## 
##   0.5 0.625  0.75 0.875     1 
##     9    23    48   148   469
FeaturePlot(pdc, c("nFeature_RNA", "pct_mt"))

We see that there is a cluster with hallmarks of poor-quality cells: high % of mitochondrial expression and low number of detected features. We will exclude it:

pdc <- FindNeighbors(pdc, k.param = 15, dims = 1:20, reduction = "harmony")
pdc <- FindClusters(pdc, resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 1019
## Number of edges: 40431
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8597
## Number of communities: 4
## Elapsed time: 0 seconds
DimPlot(pdc)

markers_2 <- FindMarkers(pdc, ident.1 = "2", only.pos = TRUE)
DT::datatable(markers_2)
pdc <- pdc[, !(pdc$cohort_type == "validation" & pdc$seurat_clusters == "2")]
DimPlot(pdc, cols = color_palette)

goi <- c("IRF8", "IL3RA", "LILRA4", "IFI44", "IFI6")
FeaturePlot(pdc, goi) &
  scale_color_viridis_c(option = "magma")

DimPlot(pdc, group.by = "annotation_20230508", cols = colors_20230508$PDC)

4 Save

saveRDS(pdc, here("scRNA-seq/results/R_objects/seurat_objects_revision/5.3-PDC_seurat_object_discovery_validation_cohorts_annotation.rds"))

5 Session Information

sessionInfo()
## R version 4.2.0 (2022-04-22)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C               LC_TIME=es_ES.UTF-8        LC_COLLATE=en_US.UTF-8     LC_MONETARY=es_ES.UTF-8    LC_MESSAGES=en_US.UTF-8    LC_PAPER=es_ES.UTF-8       LC_NAME=C                  LC_ADDRESS=C               LC_TELEPHONE=C             LC_MEASUREMENT=es_ES.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] DT_0.27            glue_1.6.2         here_1.0.1         harmony_0.1.1      Rcpp_1.0.10        class_7.3-21       caret_6.0-93       lattice_0.20-45    forcats_1.0.0      stringr_1.5.0      dplyr_1.1.0        purrr_1.0.1        readr_2.1.3        tidyr_1.3.0        tibble_3.1.8       ggplot2_3.4.0      tidyverse_1.3.2    SeuratObject_4.1.3 Seurat_4.3.0       BiocStyle_2.26.0  
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.4.1           backports_1.4.1        plyr_1.8.8             igraph_1.3.5           lazyeval_0.2.2         sp_1.6-0               splines_4.2.0          crosstalk_1.2.0        listenv_0.9.0          scattermore_0.8        digest_0.6.31          foreach_1.5.2          htmltools_0.5.4        fansi_1.0.4            magrittr_2.0.3         tensor_1.5             googlesheets4_1.0.1    cluster_2.1.4          ROCR_1.0-11            limma_3.54.1           tzdb_0.3.0             recipes_1.0.4          globals_0.16.2         gower_1.0.1            modelr_0.1.10          matrixStats_0.63.0     hardhat_1.2.0          timechange_0.2.0       spatstat.sparse_3.0-0  colorspace_2.1-0       rvest_1.0.3            ggrepel_0.9.3          haven_2.5.1            xfun_0.37              crayon_1.5.2           jsonlite_1.8.4         progressr_0.13.0       spatstat.data_3.0-0    iterators_1.0.14       survival_3.5-0         zoo_1.8-11             polyclip_1.10-4        gtable_0.3.1           gargle_1.3.0           ipred_0.9-13           leiden_0.4.3           future.apply_1.10.0    abind_1.4-5            scales_1.2.1           DBI_1.1.3              spatstat.random_3.1-3 
##  [52] miniUI_0.1.1.1         viridisLite_0.4.1      xtable_1.8-4           reticulate_1.28        stats4_4.2.0           lava_1.7.1             prodlim_2019.11.13     htmlwidgets_1.6.1      httr_1.4.4             RColorBrewer_1.1-3     ellipsis_0.3.2         ica_1.0-3              farver_2.1.1           pkgconfig_2.0.3        nnet_7.3-18            sass_0.4.5             uwot_0.1.14            dbplyr_2.3.2           deldir_1.0-6           utf8_1.2.3             labeling_0.4.2         tidyselect_1.2.0       rlang_1.0.6            reshape2_1.4.4         later_1.3.0            munsell_0.5.0          cellranger_1.1.0       tools_4.2.0            cachem_1.0.6           cli_3.6.0              generics_0.1.3         broom_1.0.3            ggridges_0.5.4         evaluate_0.20          fastmap_1.1.0          yaml_2.3.7             goftest_1.2-3          ModelMetrics_1.2.2.2   knitr_1.42             fs_1.6.0               fitdistrplus_1.1-8     RANN_2.6.1             pbapply_1.7-0          future_1.31.0          nlme_3.1-162           mime_0.12              xml2_1.3.3             compiler_4.2.0         rstudioapi_0.14        plotly_4.10.1          png_0.1-8             
## [103] spatstat.utils_3.0-1   reprex_2.0.2           bslib_0.4.2            stringi_1.7.12         highr_0.10             Matrix_1.5-3           vctrs_0.5.2            pillar_1.8.1           lifecycle_1.0.3        BiocManager_1.30.19    spatstat.geom_3.0-6    lmtest_0.9-40          jquerylib_0.1.4        RcppAnnoy_0.0.20       data.table_1.14.6      cowplot_1.1.1          irlba_2.3.5.1          httpuv_1.6.8           patchwork_1.1.2        R6_2.5.1               bookdown_0.32          promises_1.2.0.1       KernSmooth_2.23-20     gridExtra_2.3          parallelly_1.34.0      codetools_0.2-19       MASS_7.3-58.2          rprojroot_2.0.3        withr_2.5.0            sctransform_0.3.5      parallel_4.2.0         hms_1.1.2              rpart_4.1.19           timeDate_4022.108      grid_4.2.0             rmarkdown_2.20         googledrive_2.0.0      Rtsne_0.16             pROC_1.18.0            spatstat.explore_3.0-6 shiny_1.7.4            lubridate_1.9.1